Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

# Summarise dataset dimensions and the ASD/control split
glue('Number of genes: {nrow(datExpr)}\n',
     'Number of samples: {ncol(datExpr)} ',
     '({sum(datMeta$Diagnosis_ == "ASD")} ASD, ',
     '{sum(datMeta$Diagnosis_ != "ASD")} controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)

Filtering DE probes

Significance criteria: adjusted p-value<0.05 and abs(lfc)>log2(1.2) (absolute value because we care about both the under- and overexpressed genes)

# Keep only significantly DE probes: adjusted p < 0.05 and
# |log2 fold-change| > log2(1.2).
# which() drops NA comparisons (e.g. probes DESeq2 assigns NA padj),
# which would otherwise propagate as all-NA rows in the subset.
datExpr = datExpr[which(DE_info$padj<0.05 & abs(DE_info$log2FoldChange)>log2(1.2)),]
print(paste0(nrow(datExpr), ' probes left.'))
## [1] "1055 probes left."

Dimensionality reduction using PCA

Since there are more genes than samples, we can perform PCA and reduce the dimension from ~1K (the filtered DE probes) to 80 without losing any information

# PCA on samples (probes as features); with more probes than samples the
# 80 principal components retain all of the variance
pca = prcomp(t(datExpr))
datExpr_redDim = as.data.frame(pca$x)

Clustering Methods

clusterings = list()



K-means clustering

No recognisable best k, so chose k=6

# Elbow curve: total within-cluster SS for k = 1..15. Seed is fixed so the
# random starts of kmeans are reproducible. vapply (not sapply) guarantees a
# numeric vector regardless of input.
set.seed(123)
wss = vapply(1:15,
             function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 6
abline(v = best_k, col='blue')

# Final fit at the chosen k; keep the per-sample cluster labels
datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster



Hierarchical Clustering

Chose k=8 as best number of clusters.

# Hierarchical clustering on pairwise sample distances, kept as a dendrogram
h_clusts = as.dendrogram(hclust(dist(datExpr_redDim)))
plot(h_clusts)
abline(h=39, col='blue')

best_k = 8

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Colors:

  • Diagnosis: Blue=control, Green=ASD

  • Sex: Pink=Female, Blue=Male

  • Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital

  • Age: Purple=youngest, Yellow=oldest

clusterings[['hc']] = cutree(h_clusts, best_k)

# Build a lookup vector mapping each age (as a string name) to a viridis
# colour spanning the observed age range.
# NOTE(review): assumes Age values are whole numbers, so that
# seq(min, max) lines up with the number of colours generated — confirm.
create_viridis_dict = function(){
  age_range = range(datMeta$Age)
  viridis_age_cols = viridis(age_range[2] - age_range[1] + 1)
  names(viridis_age_cols) = seq(age_range[1], age_range[2])
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

# Reorder metadata rows to match the dendrogram's leaf order, then map each
# phenotype to a colour for the annotation bars drawn under the dendrogram.
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%            # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis)
# Blank the leaf labels, colour branches by the k-cluster cut (dendextend),
# and attach the phenotype colour bars below the plot
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)



Consensus Clustering

Chose the best clustering to be with k=5

*The rest of the output plots can be found in the Data/Gandal/consensusClustering/samples_DE_probes folder



Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance (the first 2 components already explain over 60% of the variance, but we decided to keep the first 37, which accumulate 90% of the variance)

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign obs to genes with FDR<0.01 using the fdrtool package


It’s not supposed to be a good method for clustering samples because ICA does not perform well with small samples (see Figure 4 of this paper)

Still, it leaves only 5 samples without a cluster

ICA_clusters %>% rowSums %>% table
## .
##  0  1  2  3  4  5  6  7  8 
##  5 32 14  9 12  4  2  1  1
# Heatmap of the sample-by-component assignment matrix (plus a column with
# each sample's total assignments), flipped so clusters run along the rows
ICA_clusters %>%
  mutate(cl_sum = rowSums(.)) %>%
  as.matrix %>%
  melt %>%
  ggplot(aes(Var2, Var1)) +
    geom_tile(aes(fill=value)) +
    xlab('Clusters') + ylab('Samples') +
    theme_minimal() +
    theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
    coord_flip()



WGCNA

This method doesn’t work well:

Using power=5 as it is the first power that exceeds the 0.85 threshold

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:30)
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.3020  0.403         0.6280 24.9000  2.73e+01 38.600
## 2      2   0.2140 -0.229         0.0855 11.5000  1.16e+01 24.100
## 3      3   0.5790 -0.508         0.4590  6.2000  5.49e+00 16.400
## 4      4   0.7520 -0.705         0.6860  3.6800  2.77e+00 11.800
## 5      5   0.9150 -0.792         0.9190  2.3200  1.47e+00  8.780
## 6      6   0.9690 -0.919         0.9660  1.5400  8.02e-01  6.720
## 7      7   0.8810 -0.996         0.8740  1.0600  5.22e-01  5.270
## 8      8   0.1650 -2.170        -0.0512  0.7520  3.48e-01  4.210
## 9      9   0.1730 -2.210        -0.0502  0.5510  2.31e-01  3.420
## 10    10   0.8990 -1.090         0.8830  0.4150  1.53e-01  2.820
## 11    11   0.8660 -1.120         0.8520  0.3200  9.83e-02  2.360
## 12    12   0.8640 -1.040         0.8920  0.2530  6.16e-02  2.000
## 13    13   0.1610 -2.930        -0.0410  0.2050  3.89e-02  1.720
## 14    14   0.1590 -2.980        -0.0282  0.1690  2.56e-02  1.490
## 15    15   0.0724 -1.970         0.1410  0.1420  1.68e-02  1.300
## 16    16   0.1260 -2.800         0.1640  0.1220  1.10e-02  1.160
## 17    17   0.0748 -1.900         0.1760  0.1060  7.49e-03  1.090
## 18    18   0.0416 -1.480         0.4140  0.0937  5.06e-03  1.040
## 19    19   0.0461 -1.490         0.3990  0.0836  3.38e-03  0.985
## 20    20   0.0575 -1.580         0.3940  0.0755  2.28e-03  0.938
## 21    21   0.0123 -0.763         0.7130  0.0687  1.54e-03  0.895
## 22    22   0.0630 -1.650         0.4320  0.0631  1.07e-03  0.855
## 23    23   0.0715 -1.690         0.4130  0.0583  7.55e-04  0.818
## 24    24   0.0387 -1.190         0.3330  0.0541  5.35e-04  0.783
## 25    25   0.0434 -1.210         0.3310  0.0505  3.81e-04  0.750
## 26    26   0.0486 -1.240         0.3180  0.0474  2.70e-04  0.719
## 27    27   0.0537 -1.260         0.3070  0.0445  1.88e-04  0.690
## 28    28   0.0590 -1.270         0.3120  0.0420  1.31e-04  0.661
## 29    29   0.0634 -1.280         0.3050  0.0397  9.16e-05  0.635
## 30    30   0.0675 -1.280         0.2990  0.0376  6.41e-05  0.609

Running WGCNA with power=5

# Build the network at the estimated best power (the first power whose
# scale-free fit R^2 clears pickSoftThreshold's default 0.85 cut)
network = blockwiseModules(t(datExpr_redDim), power=best_power$powerEstimate,
                           numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 0, 1
##       ..there is nothing to merge.

Cluster distribution

table(network$colors)
## 
##  0  1 
## 18 62
# Store the WGCNA module labels keyed by sample ID
clusterings[['WGCNA']] = setNames(network$colors, rownames(datExpr_redDim))

Running WGCNA with power=10, leaves out even more genes and still only finds 1 cluster

# Second-best candidate power
network = blockwiseModules(t(datExpr_redDim), power=10, numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 0, 1
##       ..there is nothing to merge.

Cluster distribution

table(network$colors)
## 
##  0  1 
## 43 37

Using the original expression dataset, the \(R^2\) threshold is never achieved, getting closest at power 1, but still doesn’t manage to find any clusters within the data

best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1 0.664000 18.6000        0.78300   71.20     72.30   73.6
## 2      2 0.663000  9.2200        0.76000   64.30     66.20   68.8
## 3      3 0.080100 10.8000       -0.01770   58.30     60.80   64.4
## 4      4 0.070700  7.5500       -0.00744   53.00     56.00   60.5
## 5      5 0.616000  3.1100        0.76000   48.30     51.60   56.8
## 6      6 0.597000  2.5500        0.70100   44.20     47.70   53.5
## 7      7 0.580000  2.0700        0.73300   40.50     44.20   50.5
## 8      8 0.552000  1.8300        0.63300   37.20     41.00   47.6
## 9      9 0.107000  4.4700       -0.03060   34.30     38.10   45.0
## 10    10 0.382000  1.2000        0.40000   31.60     35.50   42.6
## 11    11 0.340000  1.0400        0.30100   29.20     33.00   40.4
## 12    12 0.362000  0.9000        0.37900   27.00     30.70   38.3
## 13    13 0.401000  0.7860        0.42700   25.00     28.50   36.4
## 14    14 0.362000  0.7360        0.32400   23.20     26.50   34.5
## 15    15 0.330000  0.6580        0.27100   21.60     24.70   32.8
## 16    16 0.194000  0.5290        0.09890   20.10     23.10   31.2
## 17    17 0.150000  0.5420        0.01710   18.70     21.60   29.7
## 18    18 0.120000  0.3700        0.01690   17.50     20.20   28.3
## 19    19 0.043500  0.2720       -0.06120   16.30     18.90   27.0
## 20    20 0.028600  0.1530       -0.20000   15.30     17.70   25.7
## 21    21 0.011900  0.0920       -0.22900   14.30     16.50   24.6
## 22    22 0.001150  0.0315       -0.28200   13.40     15.50   23.4
## 23    23 0.000775  0.0240       -0.28500   12.60     14.50   22.4
## 24    24 0.000578  0.0195       -0.28100   11.80     13.60   21.4
## 25    25 0.000829  0.0225       -0.27700   11.10     12.70   20.4
## 26    26 0.002890 -0.0462       -0.28200   10.40     11.90   19.5
## 27    27 0.015600 -0.0979       -0.25800    9.78     11.20   18.7
## 28    28 0.043000 -0.1710       -0.23000    9.20     10.50   17.9
## 29    29 0.060100 -0.2080       -0.20800    8.67      9.83   17.1
## 30    30 0.082400 -0.2300       -0.17400    8.17      9.23   16.4

Running WGCNA with power=1

# Power 1 comes closest to the R^2 threshold for the raw expression data
network = blockwiseModules(datExpr, power=1, numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 1
##       ..there is nothing to merge.

Cluster distribution

table(network$colors)
## 
##  1 
## 80



Gaussian Mixture Models with hard thresholding

Points don’t seem to follow a Gaussian distribution no matter the number of clusters, chose 4 groups following the best k from K-means because the methods are similar

# BIC curve over 1..50 mixture components
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters=50,
                               criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 6 # copying k-means best_k
# Fit the mixture and assign each sample to its highest-likelihood component
gmm = GMM(datExpr_redDim, best_k)
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)

Plot of clusters with their centroids in gray

# Append the GMM centroids as extra rows; their NA labels make ggplot draw
# them in grey, so they stand out from the coloured sample clusters
gmm_points = rbind(datExpr_redDim,
                   setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) +
         geom_point() + theme_minimal())
# Clean up intermediate objects from the clustering runs.
# NOTE(review): several names here (cc_output, ICA_output, ...) come from
# chunks not visible in this view. The bare 'c' looks like a leftover loop
# variable — confirm it isn't a typo for something else (rm only warns on
# missing objects, so this would fail silently).
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names, 
   signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta, 
   best_power, c, viridis_age_cols, create_viridis_dict, pca)



Compare clusterings

Using Adjusted Rand Index:

# Combine the method clusterings with phenotype labels so ARI can score both
# cluster-vs-cluster and cluster-vs-phenotype agreement
clusters_plus_phenotype = c(clusterings,
                            list(Subject = datMeta$Subject_ID,
                                 ASD = datMeta$Diagnosis_,
                                 Region = datMeta$Brain_lobe,
                                 Sex = datMeta$Sex,
                                 Age = datMeta$Age))

# Pairwise Adjusted Rand Index between every clustering and phenotype label.
# Only the upper triangle (diagonal included) is filled; the lower triangle
# stays NA so the heatmap reports each pair once.
# seq_along replaces the 1:(length(x)) idiom, which breaks on empty input.
cluster_sim = data.frame(matrix(nrow = length(clusters_plus_phenotype), ncol = length(clusters_plus_phenotype)))
for(i in seq_along(clusters_plus_phenotype)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:length(clusters_plus_phenotype)){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)

# Round for display and plot the ARI matrix as an annotated heatmap
cluster_sim = round(as.matrix(cluster_sim), 2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)



Scatter plots

# One data frame holding the first three PCs plus every clustering and
# phenotype label, ready for the interactive scatter plot below
plot_points = datExpr_redDim %>%
              data.frame() %>%
              dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),
                     Subject_ID = datMeta$Subject_ID,
                     KMeans = as.factor(clusterings[['km']]),
                     Hierarchical = as.factor(clusterings[['hc']]),
                     Consensus = as.factor(clusterings[['cc']]),
                     ICA = as.factor(clusterings[['ICA_min']]),
                     GMM = as.factor(clusterings[['GMM']]),
                     WGCNA = as.factor(clusterings[['WGCNA']]),
                     Sex = as.factor(datMeta$Sex),
                     Region = as.factor(datMeta$Brain_lobe),
                     Diagnosis = as.factor(datMeta$Diagnosis_),
                     Age = datMeta$Age)

Now, PC1 seems to separate samples by Diagnosis relatively well

selectable_scatter_plot(plot_points, plot_points)